package com.casa.POI.dianping;


import java.io.IOException;
import java.io.InputStream;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.CompactHtmlSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.PrettyHtmlSerializer;
import org.htmlcleaner.Serializer;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

/**
 * Static helpers for fetching HTML, parsing it with HtmlCleaner and
 * serializing the resulting {@link TagNode} tree.
 *
 * <p>All methods share a single {@link HtmlCleaner} instance and a single
 * {@link CleanerProperties} configuration that are created once when the
 * class is loaded.
 *
 * <p>Typical usage:
 * <pre>
 * // Fetch a page and store a pretty-printed copy:
 * SpiderUtil.writeToFile(url, "pretty.html");
 *
 * // Fetch a page and store a compact copy:
 * SpiderUtil.getCompactSerializer().writeToFile(SpiderUtil.cleanUrl(url), "compact.html", "UTF-8");
 * </pre>
 */
public class SpiderUtil {

	/** Character encoding used both for parsing input and for writing files. */
	private static final String ENCODING = "UTF-8";

	/** Cleaner configuration shared by the parser and by every serializer created here. */
	private static final CleanerProperties htmlCleanerProps = createCleanerProperties();

	/** Parser instance shared by all clean* methods. */
	private static final HtmlCleaner htmlCleaner = new HtmlCleaner(htmlCleanerProps);

	/**
	 * Builds the cleaner configuration: special-entity translation and
	 * reserved-character-to-NCR conversion are switched on, and HTML comments
	 * are omitted.
	 */
	private static CleanerProperties createCleanerProperties() {
		CleanerProperties props = new CleanerProperties();
		props.setTranslateSpecialEntities(true);
		props.setTransResCharsToNCR(true);
		props.setOmitComments(true);
		return props;
	}

	/**
	 * Opens the given URL through {@link SmartConnection} and parses the
	 * response body as UTF-8 HTML.
	 *
	 * <p>The stream obtained from the connection is always closed before this
	 * method returns, whether parsing succeeds or fails.
	 *
	 * @param urlString address of the page to fetch
	 * @return root node of the cleaned document
	 * @throws IOException if opening, reading or closing the stream fails
	 */
	public static TagNode cleanUrl(String urlString) throws IOException {
		InputStream is = SmartConnection.getInstance().getInputStream(urlString);
		try {
			return cleanInputStream(is);
		} finally {
			// This method opened the stream, so it is responsible for releasing it.
			// A null stream is left alone so that the failure raised by the parse
			// attempt above is what the caller sees.
			if (is != null) {
				is.close();
			}
		}
	}

	/**
	 * Parses the content of the given stream as UTF-8 HTML.
	 *
	 * <p>The stream is not closed here; it remains owned by the caller.
	 *
	 * @param is stream positioned at the start of the HTML content
	 * @return root node of the cleaned document
	 * @throws IOException if reading from the stream fails
	 */
	public static TagNode cleanInputStream(InputStream is) throws IOException {
		return htmlCleaner.clean(is, ENCODING);
	}

	/**
	 * Writes the given node to a file as pretty-printed, UTF-8 encoded HTML.
	 *
	 * @param node     root of the tree to serialize
	 * @param fileName path of the file to write
	 * @throws IOException if the file cannot be written
	 */
	public static void saveToFile(TagNode node, String fileName) throws IOException {
		getPrettySerializer().writeToFile(node, fileName, ENCODING);
	}

	/**
	 * Creates a new compact HTML serializer that uses the shared cleaner properties.
	 *
	 * @return a fresh {@link CompactHtmlSerializer}
	 */
	public static Serializer getCompactSerializer() {
		return new CompactHtmlSerializer(htmlCleanerProps);
	}

	/**
	 * Creates a new pretty-printing HTML serializer that uses the shared cleaner properties.
	 *
	 * @return a fresh {@link PrettyHtmlSerializer}
	 */
	public static Serializer getPrettySerializer() {
		return new PrettyHtmlSerializer(htmlCleanerProps);
	}

	/**
	 * Fetches and cleans the page at the given URL, then stores it
	 * pretty-printed in the given file.
	 *
	 * @param urlString address of the page to fetch
	 * @param fileName  path of the file to write
	 * @throws IOException if fetching, parsing or writing fails
	 */
	public static void writeToFile(String urlString, String fileName) throws IOException {
		saveToFile(cleanUrl(urlString), fileName);
	}

	/**
	 * Entry point kept for ad-hoc manual experiments; it currently does nothing.
	 * See the class-level documentation for usage examples.
	 *
	 * @param args ignored
	 * @throws IOException declared for the benefit of experiments placed here
	 * @throws XPatherException declared for the benefit of experiments placed here
	 */
	public static void main(String[] args) throws IOException, XPatherException {
		// Intentionally empty.
	}

}
